import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd
import folium
import seaborn as sns
# Postcode data (SSI = small scale installations, SGU = small generation unit, SWH = Solar water heating )
df_SSI_SGU_Hydro = pd.read_csv("data/PostCode Data/Postcode data for small-scale installations - SGU-Hydro.csv")
df_SSI_SGU_Solar = pd.read_csv("data/PostCode Data/Postcode data for small-scale installations - SGU-Solar.csv")
df_SSI_SGU_Wind = pd.read_csv("data/PostCode Data/Postcode data for small-scale installations - SGU-Wind.csv")
df_SSI_SWH_Air_source_heat_pump = pd.read_csv("data/PostCode Data/Postcode data for small-scale installations - SWH-Air-source-heat-pump.csv")
df_SSI_SWH_Solar = pd.read_csv("data/PostCode Data/Postcode data for small-scale installations - SWH-Solar.csv")
# Battery installations with SSI by state
df_battery_installations = pd.read_csv("data/PostCode Data/State data for battery installations with small-scale systems.csv")
# Australian postcode data reference dataset
df_AU_postcode_dataset = pd.read_excel("data/australian_postcodes.xlsx")
# Whole REC database (REC = renewable energy certificate)
df_REC_database = pd.read_csv("data/wholeRECdatabase_20230120_no_dups.csv",encoding='cp1252')
df_list = [
df_SSI_SGU_Hydro,
df_SSI_SGU_Solar,
df_SSI_SGU_Wind,
df_SSI_SWH_Air_source_heat_pump,
df_SSI_SWH_Solar,
df_battery_installations,
df_AU_postcode_dataset,
df_REC_database
]
for idx, df in enumerate(df_list, start=1):
print(f"DataFrame {idx} - Name: {df.name if hasattr(df, 'name') else 'Unnamed'}, Shape: {df.shape}")
DataFrame 1 - Name: Unnamed, Shape: (20, 7) DataFrame 2 - Name: Unnamed, Shape: (2806, 41) DataFrame 3 - Name: Unnamed, Shape: (253, 5) DataFrame 4 - Name: Unnamed, Shape: (2610, 21) DataFrame 5 - Name: Unnamed, Shape: (2964, 21) DataFrame 6 - Name: Unnamed, Shape: (10, 10) DataFrame 7 - Name: Unnamed, Shape: (18513, 33) DataFrame 8 - Name: Unnamed, Shape: (6271416, 17)
df_REC_database.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6271416 entries, 0 to 6271415 Data columns (total 17 columns): # Column Dtype --- ------ ----- 0 Status object 1 Owner object 2 Accreditation_Code object 3 Fuel_Source_Active bool 4 Fuel_Source_Type object 5 Fuel_Source_Display_Name object 6 Generation_Year int64 7 Status.1 object 8 Start_Serial float64 9 End_Serial float64 10 State object 11 Owner_Name object 12 Creation_Year int64 13 Range_ID int64 14 Creation_Date object 15 Public_Registered_Person_ID int64 16 Created_By object dtypes: bool(1), float64(2), int64(4), object(10) memory usage: 771.5+ MB
df_compare_systems = pd.DataFrame([], columns=['Type','Locations','System Quantity', 'KW'])
sgu_hydro = df_SSI_SGU_Hydro["Small Unit Installation Postcode"].drop_duplicates()
postcodes = sgu_hydro.shape[0]
quantity = sum(df_SSI_SGU_Hydro["Previous Years (2001-2021) - Installation Quantity"])
kw = round(sum(df_SSI_SGU_Hydro["Previous Years (2001-2021) - SGU Output Rated Output In kW"]),2)
print(postcodes, "unique hydro postcodes")
print(quantity, "total hydro units")
print(kw , "total hydro kw")
hydro_df = pd.DataFrame([["Hydro",postcodes, quantity, kw]],columns=['Type', 'Locations','System Quantity', 'KW'])
df_compare_systems = pd.concat([df_compare_systems, hydro_df])
df_compare_systems
20 unique hydro postcodes 20 total hydro units 50.7 total hydro kw
| Type | Locations | System Quantity | KW | |
|---|---|---|---|---|
| 0 | Hydro | 20 | 20 | 50.7 |
sgu_solar = df_SSI_SGU_Solar["Small Unit Installation Postcode"].drop_duplicates()
postcodes = sgu_solar.shape[0]
quantity = sum(df_SSI_SGU_Solar["Previous Years (2001-2021) - Installation Quantity"])
kw = round(sum(df_SSI_SGU_Solar["Previous Years (2001-2021) - SGU Output Rated Output In kW"]),2)
print(postcodes, "unique solar postcodes")
print(quantity, "total solar units")
print(kw , "total solar kw")
solar_df = pd.DataFrame([["Solar",postcodes, quantity, kw]],columns=['Type', 'Locations','System Quantity', 'KW'])
df_compare_systems = pd.concat([df_compare_systems, solar_df])
df_compare_systems
2806 unique solar postcodes 3073708 total solar units 16519611.88 total solar kw
| Type | Locations | System Quantity | KW | |
|---|---|---|---|---|
| 0 | Hydro | 20 | 20 | 50.70 |
| 0 | Solar | 2806 | 3073708 | 16519611.88 |
sgu_wind = df_SSI_SGU_Wind["Small Unit Installation Postcode"].drop_duplicates()
postcodes = sgu_wind.shape[0]
quantity = sum(df_SSI_SGU_Wind["Previous Years (2001-2021) - Installation Quantity"])
kw = round(sum(df_SSI_SGU_Wind["Previous Years (2001-2021) - SGU Output Rated Output In kW"]),2)
print(postcodes, "unique wind postcodes")
print(quantity, "total wind units")
print(kw , "total wind kw")
wind_df = pd.DataFrame([["Wind",postcodes, quantity, kw]],columns=['Type', 'Locations','System Quantity', 'KW'])
df_compare_systems = pd.concat([df_compare_systems, wind_df])
df_compare_systems
253 unique wind postcodes 424 total wind units 1468.67 total wind kw
| Type | Locations | System Quantity | KW | |
|---|---|---|---|---|
| 0 | Hydro | 20 | 20 | 50.70 |
| 0 | Solar | 2806 | 3073708 | 16519611.88 |
| 0 | Wind | 253 | 424 | 1468.67 |
swh_hp = df_SSI_SWH_Air_source_heat_pump["Small Unit Installation Postcode"].drop_duplicates()
postcodes = swh_hp.shape[0]
quantity = sum(df_SSI_SWH_Air_source_heat_pump["Previous Years (2001-2021) - Installation Quantity"])
kw = 0 # no kw column
print(postcodes, "unique heat-pump postcodes")
print(quantity, "total heat-pump units")
print(kw , "total heat-pump kw")
hp_df = pd.DataFrame([["Heat-pump",postcodes, quantity, kw]],columns=['Type', 'Locations','System Quantity', 'KW'])
df_compare_systems = pd.concat([df_compare_systems, hp_df])
df_compare_systems
2610 unique heat-pump postcodes 365635 total heat-pump units 0 total heat-pump kw
| Type | Locations | System Quantity | KW | |
|---|---|---|---|---|
| 0 | Hydro | 20 | 20 | 50.70 |
| 0 | Solar | 2806 | 3073708 | 16519611.88 |
| 0 | Wind | 253 | 424 | 1468.67 |
| 0 | Heat-pump | 2610 | 365635 | 0.00 |
swh_solar = df_SSI_SWH_Solar["Small Unit Installation Postcode"].drop_duplicates()
postcodes = swh_solar.shape[0]
quantity = sum(df_SSI_SWH_Solar["Previous Years (2001-2021) - Installation Quantity"])
kw = 0 # no kw column
print(postcodes, "unique swh-solar postcodes")
print(quantity, "total swh-solar units")
print(kw , "total swh-solar kw")
swh_solar_df = pd.DataFrame([["SWH-Solar",postcodes, quantity, kw]],columns=['Type', 'Locations','System Quantity', 'KW'])
df_compare_systems = pd.concat([df_compare_systems, swh_solar_df])
df_compare_systems
2964 unique swh-solar postcodes 1036390 total swh-solar units 0 total swh-solar kw
| Type | Locations | System Quantity | KW | |
|---|---|---|---|---|
| 0 | Hydro | 20 | 20 | 50.70 |
| 0 | Solar | 2806 | 3073708 | 16519611.88 |
| 0 | Wind | 253 | 424 | 1468.67 |
| 0 | Heat-pump | 2610 | 365635 | 0.00 |
| 0 | SWH-Solar | 2964 | 1036390 | 0.00 |
df_compare_systems[["Locations","System Quantity","KW"]] = df_compare_systems[["Locations","System Quantity","KW"]].apply(pd.to_numeric)
Locations_graph = df_compare_systems.plot.bar(x='Type', y='Locations', rot=0)
System_Quantity_graph = df_compare_systems.plot.bar(x='Type', y='System Quantity', rot=0)
kW_graph = df_compare_systems.plot.bar(x='Type', y='KW', rot=0)
1- Do swh units not produce KW or is it simply not included in the datset?
2- Is the observed KW value already calculated based on the system quantity, or should the two be multiplied to find the overall kw production per postcode?
1- Most locations that have solar panels will also have a solar water heating which makes intuitive sense.
2- In a given household you may have 1 heat pump and 1 SWH-solar but many panels which explains the quantity difference.
3- It's very rare for residential properties to have wind or hydro which explains the low numbers compared to solar.
4- Between 2001 and 2021 (The last 20 years), Australian residents installed around 16 Million kW of solar energy, 1500kW of Wind energy, and 50kW of hydro energy.
Creating a combined postcode dataset
swh_solar_df = pd.DataFrame([["SWH-Solar",postcodes, quantity, kw]],columns=['Type', 'Locations','System Quantity', 'KW'])
df_compare_systems = pd.concat([df_compare_systems, swh_solar_df])
df_SSI_SGU_Hydro_small = df_SSI_SGU_Hydro[['Small Unit Installation Postcode','Previous Years (2001-2021) - Installation Quantity','Previous Years (2001-2021) - SGU Output Rated Output In kW']]
df_SSI_SGU_Solar_small = df_SSI_SGU_Solar[['Small Unit Installation Postcode','Previous Years (2001-2021) - Installation Quantity','Previous Years (2001-2021) - SGU Output Rated Output In kW']]
df_SSI_SGU_Wind_small = df_SSI_SGU_Wind[['Small Unit Installation Postcode','Previous Years (2001-2021) - Installation Quantity','Previous Years (2001-2021) - SGU Output Rated Output In kW']]
df_SSI_SWH_Air_source_heat_pump_small = df_SSI_SWH_Air_source_heat_pump[['Small Unit Installation Postcode','Previous Years (2001-2021) - Installation Quantity']]
df_SSI_SWH_Solar_small = df_SSI_SWH_Solar[['Small Unit Installation Postcode','Previous Years (2001-2021) - Installation Quantity']]
df_SSI_SGU_Hydro_small2= df_SSI_SGU_Hydro_small.copy(deep=True)
df_SSI_SGU_Solar_small2= df_SSI_SGU_Solar_small.copy(deep=True)
df_SSI_SGU_Wind_small2= df_SSI_SGU_Wind_small.copy(deep=True)
df_SSI_SWH_Air_source_heat_pump_small2= df_SSI_SWH_Air_source_heat_pump_small.copy(deep=True)
df_SSI_SWH_Solar_small2= df_SSI_SWH_Solar_small.copy(deep=True)
df_SSI_SWH_Air_source_heat_pump_small2["Previous Years (2001-2021) - SGU Output Rated Output In kW"] = None
df_SSI_SWH_Solar_small2["Previous Years (2001-2021) - SGU Output Rated Output In kW"] = None
df_SSI_SGU_Hydro_small2['Type'] = "Hydro"
df_SSI_SGU_Solar_small2['Type'] ="Solar"
df_SSI_SGU_Wind_small2['Type'] = "Wind"
df_SSI_SWH_Air_source_heat_pump_small2['Type'] = "SWG- Heat Pump"
df_SSI_SWH_Solar_small2['Type'] = "SWH Solar"
df_systems_combined = pd.concat([df_SSI_SGU_Hydro_small2,df_SSI_SGU_Solar_small2,df_SSI_SGU_Wind_small2,df_SSI_SWH_Air_source_heat_pump_small2,df_SSI_SWH_Solar_small2])
Cleaning up the Australian postcode reference dataset
df_AU_postcode_dataset_small = df_AU_postcode_dataset[['Postcode','State','Long','Lat','LGA Region','Electorate Rating']]
df_AU_postcode_dataset_small = df_AU_postcode_dataset_small.drop_duplicates(subset='Postcode', keep="first")
df_AU_postcode_dataset_small
| Postcode | State | Long | Lat | LGA Region | Electorate Rating | |
|---|---|---|---|---|---|---|
| 0 | 200 | ACT | 149.119000 | -35.277700 | Unincorporated ACT | NaN |
| 2 | 800 | NT | 130.836680 | -12.458684 | Darwin Waterfront Precinct | Inner Metropolitan |
| 4 | 801 | NT | 130.836680 | -12.458684 | Darwin Waterfront Precinct | Rural |
| 5 | 803 | NT | 0.000000 | 0.000000 | NaN | NaN |
| 6 | 804 | NT | 130.873315 | -12.428017 | Darwin | NaN |
| ... | ... | ... | ... | ... | ... | ... |
| 18508 | 9013 | QLD | 152.823141 | -27.603479 | Ipswich | NaN |
| 18509 | 9015 | QLD | 152.823141 | -27.603479 | Ipswich | NaN |
| 18510 | 9464 | QLD | 153.074982 | -27.397055 | Brisbane | NaN |
| 18511 | 9726 | QLD | 153.412197 | -28.008783 | Gold Coast | NaN |
| 18512 | 9999 | VIC | 144.956776 | -37.817403 | Melbourne | NaN |
3173 rows × 6 columns
Merge in the postcode reference data with combined postcode dataset
df_systems_combined_extra = df_systems_combined.merge(df_AU_postcode_dataset_small, how='left', left_on='Small Unit Installation Postcode', right_on='Postcode')
df_systems_combined_extra.isnull().sum()
Small Unit Installation Postcode 0 Previous Years (2001-2021) - Installation Quantity 0 Previous Years (2001-2021) - SGU Output Rated Output In kW 5574 Type 0 Postcode 217 State 217 Long 217 Lat 217 LGA Region 218 Electorate Rating 1021 dtype: int64
We expect kW output to have null values but not other merged columns which all suspiciously have 217 null values
df_systems_combined_extra[df_systems_combined_extra.Postcode.isnull()]
| Small Unit Installation Postcode | Previous Years (2001-2021) - Installation Quantity | Previous Years (2001-2021) - SGU Output Rated Output In kW | Type | Postcode | State | Long | Lat | LGA Region | Electorate Rating | |
|---|---|---|---|---|---|---|---|---|---|---|
| 20 | 0 | 4 | 4.46 | Solar | NaN | NaN | NaN | NaN | NaN | NaN |
| 101 | 1848 | 1 | 1.04 | Solar | NaN | NaN | NaN | NaN | NaN | NaN |
| 510 | 2552 | 1 | 0.56 | Solar | NaN | NaN | NaN | NaN | NaN | NaN |
| 1502 | 4062 | 1 | 1.50 | Solar | NaN | NaN | NaN | NaN | NaN | NaN |
| 1503 | 4063 | 1 | 1.25 | Solar | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8645 | 7919 | 1 | NaN | SWH Solar | NaN | NaN | NaN | NaN | NaN | NaN |
| 8646 | 7920 | 1 | NaN | SWH Solar | NaN | NaN | NaN | NaN | NaN | NaN |
| 8648 | 8084 | 1 | NaN | SWH Solar | NaN | NaN | NaN | NaN | NaN | NaN |
| 8650 | 9072 | 1 | NaN | SWH Solar | NaN | NaN | NaN | NaN | NaN | NaN |
| 8651 | 9156 | 1 | NaN | SWH Solar | NaN | NaN | NaN | NaN | NaN | NaN |
217 rows × 10 columns
After inspecting these 217 rows, we find that they don't correspond to legitimate postcodes and seem to be a data mistake caused by the author of the dataset, likely by prefilling consecutive postcode values.
Therefore we can remove these 217 rows
df_systems_combined_cleaned = df_systems_combined_extra[~df_systems_combined_extra.Postcode.isnull()]
df_systems_combined_cleaned
#df_systems_combined_cleaned.to_excel("Output2.xlsx")
| Small Unit Installation Postcode | Previous Years (2001-2021) - Installation Quantity | Previous Years (2001-2021) - SGU Output Rated Output In kW | Type | Postcode | State | Long | Lat | LGA Region | Electorate Rating | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 837 | 1 | 4.00 | Hydro | 837.0 | NT | 131.016647 | -12.704767 | Litchfield | Rural |
| 1 | 2484 | 1 | 1.90 | Hydro | 2484.0 | NSW | 153.316480 | -28.380590 | Tweed | Rural |
| 2 | 2527 | 1 | 4.00 | Hydro | 2527.0 | NSW | 150.754592 | -34.583539 | Shellharbour | Provincial |
| 3 | 3409 | 1 | 1.05 | Hydro | 3409.0 | VIC | 141.863237 | -36.724843 | Horsham | Rural |
| 4 | 3691 | 1 | 1.00 | Hydro | 3691.0 | VIC | 147.880991 | -36.027154 | Towong | Rural |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8642 | 7469 | 2 | NaN | SWH Solar | 7469.0 | TAS | 145.201632 | -41.784391 | Waratah-Wynyard | Rural |
| 8643 | 7470 | 3 | NaN | SWH Solar | 7470.0 | TAS | 145.500566 | -41.902129 | West Coast | Rural |
| 8647 | 8002 | 1 | NaN | SWH Solar | 8002.0 | VIC | 144.982207 | -37.818517 | Yarra | NaN |
| 8649 | 9000 | 1 | NaN | SWH Solar | 9000.0 | QLD | 152.823141 | -27.603479 | Ipswich | NaN |
| 8652 | 9726 | 2 | NaN | SWH Solar | 9726.0 | QLD | 153.412197 | -28.008783 | Gold Coast | NaN |
8436 rows × 10 columns
m = folium.Map(location=[-38.2744, 140.7751], zoom_start=4)
for index, row in df_systems_combined_cleaned.iterrows():
folium.Circle(
location=[row['Lat'], row['Long']],
popup= 'Postcode:' +str(row['Postcode']),
tooltip=row['Postcode'],
radius= 4000,
color='green',
fill=True,
fill_color='black'
).add_to(m)
m